In [ ]:
# Import the libraries we need
from os import getcwd, listdir
from os.path import abspath, dirname, isfile, join, splitext
import pandas as pd
import time
import datetime
import matplotlib.pyplot as plt

%matplotlib inline

In [ ]:
# Resolve the folder that holds the csv output, relative to the current working directory
output_folder_name = 'q4_pollutants_over_time'   # TODO: Update this if needed
data_root = join(getcwd(), '..', 'data')
output_dir = abspath(join(data_root, output_folder_name))

In [ ]:
# Collect the full path of every regular file in the output directory whose extension is .csv
csv_files = []
for entry in listdir(output_dir):
    candidate = join(output_dir, entry)
    if isfile(candidate) and splitext(candidate)[1] == '.csv':
        csv_files.append(candidate)
print("{} csv files found".format(len(csv_files)))

In [ ]:
# Create a single csv file from the output csv files
output_file = join(output_dir, 'q4_output.csv')

# The combined file is written into the same folder the csv list was built from,
# so after a first run the listing can contain it. Never read the file we are
# about to truncate and rewrite.
input_files = [path for path in csv_files if abspath(path) != abspath(output_file)]

with open(output_file, 'w') as o_file:
    for num, csv_file in enumerate(input_files):
        with open(csv_file, 'r') as f:
            lines = f.readlines()
        # Keep the first line (treated as the header row) of the first file only;
        # it is dropped from every subsequent file.
        if num > 0:
            lines = lines[1:]
        for line in lines:
            # A final line without a trailing newline would otherwise be fused
            # with the first row of the next file.
            o_file.write(line if line.endswith('\n') else line + '\n')

In [ ]:
# Load the combined csv into a DataFrame.
# header=0 together with names= means the file's own first row is consumed as a
# header and replaced by the column names given here.
column_names = ['parameter_name', 'state', 'year', 'month', 'arithmetic_mean']
df = pd.read_csv(output_file, names=column_names, header=0, low_memory=False)
df.head()

In [ ]:
# Report the size of the DataFrame as (rows, columns)
rows_cols = df.shape
row_count, column_count = rows_cols
print("Rows: {}".format(row_count))
print("Columns: {}".format(column_count))

In [ ]:
# Describe the dataframe
# describe() is called with its default arguments; the resulting summary table
# is displayed as this cell's output.
df.describe()

In [ ]:
# Check the data types
# No dtype argument was passed to read_csv when df was loaded, so these are the
# types pandas inferred for each column.
df.dtypes

California Analysis


In [ ]:
# Create a dataframe for California
state_for_analysis = 'California'
# Take an explicit copy: later cells add and drop columns on cali_df, and those
# changes should act on an independent frame rather than on a filtered slice of df.
cali_df = df.loc[df['state'] == state_for_analysis].copy()
cali_df.head()

In [ ]:
# Number of distinct parameter names that appear in the California rows
cali_measured_params = cali_df['parameter_name'].unique()
param_count = len(cali_measured_params)
print(param_count)

In [ ]:
# Number of rows per parameter (non-null 'state' values counted within each group)
rows_by_parameter = cali_df.groupby('parameter_name')
rows_by_parameter['state'].count()

In [ ]:
# Verify the occurrence count of 1122-Tetrachloroethane in the Cali dataframe
is_tetrachloroethane = cali_df['parameter_name'] == '1122-Tetrachloroethane'
df2 = cali_df[is_tetrachloroethane]
len(df2)

In [ ]:
# Add a date column to the dataframe by combining the month and year columns

def create_datetime(year, month):
    """Return a datetime for the first day of the given month.

    Args:
        year: Calendar year.
        month: Month number (1-12).

    Returns:
        datetime.datetime set to day 1 of that month.
    """
    return datetime.datetime(year=year, month=month, day=1)

# Build the whole column in one vectorized call instead of a Python-level
# row-by-row apply: to_datetime assembles dates from year/month/day columns.
# assign() returns a new frame, so no column is written onto a filtered slice
# and the chained-assignment warning no longer needs to be disabled globally.
# create_datetime is kept as the scalar equivalent for one-off use.
cali_df = cali_df.assign(
    observation_date=pd.to_datetime(cali_df[['year', 'month']].assign(day=1))
)
cali_df.head()

In [ ]:
# Drop the state, year and month columns, then make observation_date the index
# so we can create some charty goodness
cali_df = (
    cali_df
    .drop(['state', 'year', 'month'], axis=1)
    .set_index('observation_date')
)

cali_df.head()

In [ ]:
# Chart this bad boy!
parameter_to_chart = '1122-Tetrachloroethane'
# A line chart joins points in row order, and the rows are in the order the csv
# files were merged, so sort by the index before plotting. The title names the
# pollutant so this chart can be told apart from the others.
chart_df = cali_df.loc[cali_df['parameter_name'] == parameter_to_chart].sort_index()
chart_df.plot(title=parameter_to_chart)

In [ ]:
# Histogram of the rows for the parameter currently selected in parameter_to_chart
is_charted_parameter = cali_df['parameter_name'] == parameter_to_chart
cali_df.loc[is_charted_parameter].hist(alpha=0.5)

In [ ]:
# MOAR CHARTS!
parameter_to_chart = 'Tetrachloroethylene'
# A line chart joins points in row order, and the rows are in the order the csv
# files were merged, so sort by the index before plotting. The title names the
# pollutant so this chart can be told apart from the others.
chart_df = cali_df.loc[cali_df['parameter_name'] == parameter_to_chart].sort_index()
chart_df.plot(title=parameter_to_chart)

In [ ]: